# Canen and Martin (2019)
# Robustness check: drop markets with lots of missingness in program data

library(tidyverse)
library(magrittr)
library(data.table)
library(lubridate)
library(hms)
library(ggthemes)
theme_set(theme_few())


### SET WORKING DIRECTORY HERE ###
path_to_archive <- "replication/"
data_dir <- paste0(path_to_archive, "data/")
setwd(data_dir)
plot_dir <- paste0(path_to_archive, "plots/")
tables_dir <- paste0(path_to_archive, "tables/")

### FUNCTION DEFINITIONS FOR WEIGHTING SCHEMES
diff2 <- function(x) x[seq(1,length(x)-1,2)] - x[seq(2,length(x),2)]

inv_var_weight <- function(i,d) {
	weighted.mean(d[i, est], w=sqrt(d[i, n_t] * d[i, n_c]), na.rm=T)
}

total_size_weight <- function(i, d) {
	weighted.mean(d[i,est], w=d[i,n_t] + d[i,n_c], na.rm=T)
}

simple_avg <- function(i,d){
	mean(d[i,est], na.rm=T)
}


### examine missingness in the program data
### THIS FILE IS PROPRIETARY (FROM FWM DATA)
load("news_program_events_dt.RData")
###

natl_chans <- news_schedule %>% 
	.[channel %in% c("CNN","MSNBC","FNC"),.(prog_count=.N),by=.(channel, air_date)] %>%
	.[,.(mean_progs_day = mean(prog_count), days_covered = .N), by = .(channel)]

# filter local channels only, count programs per channel-day
daily_progs <- news_schedule %>% as.data.table %>%
	.[grepl("^[WK][A-Z]{2}[A-Z]?(DT\\d?)?(LD)?$", channel),.(prog_count=.N),by=.(channel, air_date)]

coverage <- daily_progs[,.(mean_progs_day = mean(prog_count), days_covered = .N), by = .(channel)]
summary(coverage)
ggplot(aes(x=days_covered), data = coverage) + geom_histogram(binwidth=2)

coverage[,bad_coverage:= days_covered <= 51]

call2dma <- fread(paste0(data_dir, "callsigns.csv")) %>%
	setnames(old="chcall", new="channel")
coverage[call2dma, on="channel",nomatch=0] %>% 
	.[,.(ch_good = sum(!bad_coverage), ch_bad = sum(bad_coverage)), by=.(DMA)] %>%
	.[ch_good>0]

load(paste0(data_dir, "dma_timezone.RData"))

#  1: BLUEFIELD-BECKLEY-OAK HILL       2      0
#  2:                    BUFFALO       2      7
#  3:      CHARLESTON-HUNTINGTON      10      3
#  4:          CLARKSBURG-WESTON       1      0
#  5:           DALLAS-FT. WORTH       9      9
#  6:                    DETROIT       1      2
#  7:              OKLAHOMA CITY       1     17
#  8:                PARKERSBURG       1      0
#  9:                 PITTSBURGH       6      1
# 10:        RICHMOND-PETERSBURG       1      1
# 11:          ROANOKE-LYNCHBURG      13      2
# 12:                SHERMAN-ADA       5      0
# 13:          TRI-CITIES, TN-VA       6      5
# 14:      WHEELING-STEUBENVILLE       3      0
# 15:     WICHITA FALLS & LAWTON       1      1

good_coverage_dmas <- c(559, 514, 564, 623, 505, 650, 597, 508, 573, 657, 554, 627)
dd_wide <- readRDS(paste0(data_dir, "all_dd_data.rds"))
dd_wide <- dd_wide[dma_code %in% good_coverage_dmas]

dd_long <- dd_wide %>% 
	gather(key="hour", val="view_sum", dplyr::starts_with("s_pre"), dplyr::starts_with("s_post")) %>%
	as.data.table

# collapse to combined C group
dd_long <- dd_long[,`:=` (groupTC = if_else(group == "T", "T", "C"), n_c = n_c1 + n_c2 + n_c12)] %>%
  .[,.(view_sum = sum(view_sum)), by = .(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c, hour, group = groupTC)]



dd_long[, `:=`(view_mean = view_sum / ifelse(group=="C", n_c, n_t),
			   pre_post = sub("s_(pre|post)_(\\d+)", "\\1", hour),
			   hour = sub("s_(pre|post)_(\\d+)", "\\2", hour) %>% as.numeric)]

dd <- 
	dd_long[, .(news_min = sum(view_mean)/60), by = .(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c, group, pre_post)] %>%
	setkey(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c, group, pre_post) %>%
	.[, .(est = -diff2(diff2(.SD$news_min))) ,by = .(dma_code, channel, affiliate, program, ad_time_utc, n_t, n_c)]

# full sample point estimate, bootstrap
pt_est_full <- inv_var_weight(1:nrow(dd),dd)
set.seed(-3258)
bs_full <- map_dbl(1:1000, ~ inv_var_weight(sample.int(nrow(dd), replace=T), dd))
results_full <- data.table(replicate = 0:1000, estimate = c(pt_est_full, bs_full))

pt_est_full_simple <- simple_avg(1:nrow(dd),dd)
set.seed(-3258)
bs_full_simple <- map_dbl(1:1000, ~ simple_avg(sample.int(nrow(dd), replace=T), dd))
results_full_simple <- data.table(replicate = 0:1000, estimate = c(pt_est_full_simple, bs_full_simple))

pt_est_full_size <- total_size_weight(1:nrow(dd),dd)
set.seed(-3258)
bs_full_size <- map_dbl(1:1000, ~ total_size_weight(sample.int(nrow(dd), replace=T), dd))
results_full_size <- data.table(replicate = 0:1000, estimate = c(pt_est_full_size, bs_full_size))

### confidence intervals ###
quantiles <- c(0.01,0.025,0.975,0.99)

ci_full <- results_full[,c(point_est=estimate[replicate==0], map(quantiles, ~ quantile(estimate[replicate!=0], probs=.)) %>% set_names(paste("q",c("01","025","975","99"), sep="_")))]
ci_full_simple <- results_full_simple[,c(point_est=estimate[replicate==0], map(quantiles, ~ quantile(estimate[replicate!=0], probs=.)) %>% set_names(paste("q",c("01","025","975","99"), sep="_")))]
ci_full_size <- results_full_size[,c(point_est=estimate[replicate==0], map(quantiles, ~ quantile(estimate[replicate!=0], probs=.)) %>% set_names(paste("q",c("01","025","975","99"), sep="_")))]


# cis_all <- readRDS(file="dd_bootstrap_cis.rds")
# cis_all_simple <- readRDS(file="dd_bootstrap_cis_simple_wgt.rds")
# cis_all_size <- readRDS(file="dd_bootstrap_cis_size_wgt.rds")

library(stargazer)

### TABLE D.1
y <- rnorm(100)
x <- matrix(rnorm(100*nrow(ci_full)), nrow=100)

fake_model <- lm(y ~ x - 1)



dd_bootstrap_cis <- stargazer(list(fake_model, fake_model, fake_model),
		  title="Average Effects and Bootstrapped Confidence Intervals: Dropping DMAs with missing local news program data.",
		  style="apsr",
		  out=paste0(tables_dir, "dd_bootstrap_cis_robust_missing.tex"),
		  label="tab:dd_boostrap_cis_robust_missing",
		  coef=list(ci_full[,point_est], ci_full_size[,point_est], ci_full_simple[,point_est]), 
		  ci=T,
		  ci.custom=list(ci_full[, .(q_025, q_975)] %>% as.matrix,
		  				 ci_full_size[, .(q_025, q_975)] %>% as.matrix,
		  				 ci_full_simple[, .(q_025, q_975)] %>% as.matrix),
		  star.cutoffs = NA,
		  omit.stat = "all",
		  column.labels = c("Inv. Variance", "Total Devices", "Equally Weighted"),
		  dep.var.labels = "Effect (Minutes)",
		  covariate.labels = "Treated",
		  notes="\\parbox[t]{0.7\\linewidth}{Table reports average effects, weighted by 1) the geometric mean of treatment and control group size, 2) total devices in treatment and control groups and 3) a simple equally weighted average. Confidence intervals are the central 95\\% interval of 1000 bootstrap replicates. Sample drops ads run in DMAs with extensive missingness of the local news program schedule.}",
		  notes.append=F
		  )
